from datetime import datetime
from scipy import stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from utils import utils, plotting
%matplotlib inline
# --- Notebook settings ---
# Widen pandas display limits so wide frames / long outputs are not truncated.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 50000
sns.set(font_scale=1.2)
# Mapbox token is required by the plotly choropleth maps drawn later.
px.set_mapbox_access_token(utils.get_token())
# Raw review texts (one row per review, keyed by listing_id — see isin() joins below).
reviews = pd.read_csv('data/reviews.csv')
reviews.info()
# Cleaned listings table produced by an earlier step of this project.
listings = pd.read_pickle('data/listings_clean.pkl')
listings.info()
# Shorten verbose column names for easier downstream reference.
col_map = {
    'host_is_superhost': 'by_superhost',
    'neighbourhood_cleansed': 'neighborhood',
    'review_scores_rating': 'score_rtg',
    'review_scores_accuracy': 'score_accu',
    'review_scores_cleanliness': 'score_clean',
    'review_scores_checkin': 'score_check',
    'review_scores_communication': 'score_comm',
    'review_scores_location': 'score_loc',
    'review_scores_value': 'score_val',
}
listings.rename(columns=col_map, inplace=True)
# Superhost flag as 0/1 integers.
listings['by_superhost'] = listings['by_superhost'].astype(int)
listings.head(2)
# Restrict the analysis to listings priced at $765 or less.
price_cap = 765
list765 = listings.loc[listings['price'] <= price_cap].copy()
list765.describe()
# Price distribution, split into regular hosts vs superhosts
plotting.hist_with_hue(df=list765, col='price', bins=range(0, 770, 10))
# Summary statistics of price per host group
utils.desc_byhost(df=list765, col='price')
# Overall-rating distribution, split by host group
plotting.hist_with_hue(df=list765, col='score_rtg', bins=32)
# Summary statistics of the overall rating per host group
utils.desc_byhost(df=list765, col='score_rtg')
# Rating distribution in relative (percentage) terms, one panel per host group.
fig, (ax_reg, ax_sup) = plt.subplots(1, 2, figsize=(16, 4))
plotting.hist_pct(df=list765, ax=ax_reg, filter_val=0, title='Regular Hosts', xlab='Rating Interval', ylab='Percent of Ratings')
plotting.hist_pct(df=list765, ax=ax_sup, filter_val=1, title='Superhosts', xlab='Rating Interval')
# Welch's two-sample t-test (unequal variances): do mean ratings differ
# between regular hosts and superhosts?
reghost = list765.loc[list765['by_superhost'] == 0].copy()
suphost = list765.loc[list765['by_superhost'] == 1].copy()
stats.ttest_ind(reghost.score_rtg.dropna(), suphost.score_rtg.dropna(), equal_var=False)
# Joint distribution of price and overall rating
sns.jointplot(data=list765, x='price', y='score_rtg');
# Price plus every rating-type column
score_vars = ['price'] + ['score_' + s for s in ('rtg', 'accu', 'clean', 'check', 'comm', 'loc', 'val')]
# Price against each individual rating type
plotting.scatter_subplots(df=list765, var_list=score_vars)
# Pairwise correlations (incl. Spearman) across price and ratings
plotting.corr_heatmap(df=list765, var_list=score_vars, show_spearman=True, palette='viridis_r');
# Ratings are heavily concentrated near 100; downsample the 60-100 range
# interval by interval to balance the distribution.
downsampled_rtg = utils.sample_byinterval(df=list765, col='score_rtg', lower=60, upper=100, step=10)
downsampled_rtg.score_rtg.hist()
# Price vs overall rating on the balanced sample
sns.jointplot(data=downsampled_rtg, x='price', y='score_rtg');
# Correlations on the balanced sample
plotting.corr_heatmap(df=downsampled_rtg, var_list=score_vars, show_spearman=True, palette='viridis_r');
# There is a weak correlation between the price and the location score, and,
# after downsampling the high ratings, between the price and the cleanliness
# score. But other than these weak correlations, there is no relationship
# between rating and price.
# Extract listings with an overall rating of less than 30
# IDs of listings rated below 30, and their non-null review texts
low = list765.loc[list765['score_rtg'] < 30, 'id'].tolist()
bad = reviews[reviews.listing_id.isin(low)].dropna(subset=['comments'])
len(low), len(bad)
# IDs of perfectly rated (100) listings, and their non-null review texts
high = list765.loc[list765['score_rtg'] == 100, 'id'].tolist()
good = reviews[reviews.listing_id.isin(high)].dropna(subset=['comments'])
len(high), len(good)
# Concatenate each group's review texts into one large string
bad_reviews = ' '.join(bad.comments)
good_reviews = ' '.join(good.comments)
# Rough word count of each corpus
len(bad_reviews.split(' ')), len(good_reviews.split(' '))
# First-pass word clouds, before any custom stopword filtering.
plt.figure(figsize=(20, 10))
plt.subplot(1, 2, 1)  # left: reviews of low-rated listings
plotting.wordcloud(text=bad_reviews, bg_color='black')
plt.subplot(1, 2, 2)  # right: reviews of top-rated listings
plotting.wordcloud(text=good_reviews)
# Extra stopwords: generic lodging words, place names, and host first names
# that dominate the clouds without conveying sentiment.
# Implicit string concatenation replaces the original backslash continuation,
# which embedded each continuation line's leading indentation in the string.
words = ('place stay Airbnb house apartment home day us booked said know staying something going wa Karla Jane John Ryan '
         'ha visit made unit one will even LA trip night didn got went time hour room please property manager Nickey '
         'another told week say someone go address thing people first though space hotel paid use book booking still '
         'pay later take friend hi Los Angeles Santa Monica cottage feel Hollywood make way well trying look came guest')
stopwords = set(STOPWORDS)
# split() with no separator collapses whitespace runs and yields no empty
# tokens; the original split(' ') added '' entries to the stopword set.
stopwords.update(words.split())
# Redraw the word clouds with the expanded stopword list, stacked vertically.
plt.figure(figsize=(16, 16))
plt.subplot(2, 1, 1)  # top: low-rated listings' reviews
plotting.wordcloud(text=bad_reviews, bg_color='black', stopwords=stopwords)
plt.subplot(2, 1, 2)  # bottom: top-rated listings' reviews
plotting.wordcloud(text=good_reviews, stopwords=stopwords)
# Well-reviewed, perfectly rated listings: at least 50 reviews and a 100 rating.
well_reviewed = list765.number_of_reviews >= 50
perfect = list765.score_rtg == 100
rating100 = list765[well_reviewed & perfect].copy()
# Average listing price of these listings, per neighborhood
price100 = utils.agg_to_2cols(df=rating100, agg_col='price', agg_col_name='Price')
# Per-neighborhood counts; keep neighborhoods with at least 10 such listings
ratings = utils.agg_to_2cols(df=rating100, agg_col='score_rtg', agg_col_name='Num_properties', agg_by_mean=False)
ratings = ratings[ratings.Num_properties >= 10]
# Bar chart of those neighborhoods
plt.figure(figsize=(10, 5))
plotting.labeled_barplot(df=ratings, x='Num_properties', y='Neighborhood',
                         title='Neighborhoods with Maximum-rated Listings and Their Average Listing Price', upper=35)
# Annotate each bar with the neighborhood's average listing price.
# Iterate positionally: `ratings` was filtered above, so its index labels are
# not a clean 0..n-1 range — the original label-based ratings.loc[i, ...] could
# hit the wrong row or raise KeyError, and range(11) hard-coded the row count.
for i, nb in enumerate(ratings['Neighborhood']):
    avg_price = price100.loc[price100.Neighborhood == nb, 'Price'].values[0]
    plt.text(35.5, i, f'Avg price: ${avg_price:.02f}', va='center')
# Map view: neighborhoods with >= 10 maximum-rated, well-reviewed listings
plotting.choropleth_map(df=ratings, color_col='Num_properties').show()
# Neighborhoods with at least 50 listings.
# NOTE(review): with agg_by_mean=False the 'Price' column presumably holds a
# per-neighborhood listing COUNT (as Num_properties did above), despite its
# name — confirm against utils.agg_to_2cols.
nb50 = utils.agg_to_2cols(df=list765, agg_col='price', agg_col_name='Price', agg_by_mean=False)
nb50 = nb50[nb50.Price >= 50]
# Average listing price per neighborhood, restricted to those neighborhoods
in_big_nb = list765.neighborhood.isin(nb50.Neighborhood)
prices = utils.agg_to_2cols(df=list765[in_big_nb], agg_col='price', agg_col_name='Avg_price')
# Map of average listing price, centered on the LA area
fig = plotting.choropleth_map(df=prices, color_col='Avg_price', center_lat=34.20, center_lon=-118.25,
                              zoom=8.5, height=800, colorbar_range=(0, 300))
fig.show()
# Bar chart: 10 most expensive and 10 cheapest neighborhoods.
# (Head/tail slicing assumes `prices` comes back sorted by Avg_price —
# same assumption the original iloc slices made.)
plt.figure(figsize=(12, 6))
extremes = pd.concat([prices.head(10), prices.tail(10)])
plotting.labeled_barplot(df=extremes, x='Avg_price', y='Neighborhood',
                         title='10 Most Expensive and 10 Cheapest Neighborhoods', upper=400)
# Same comparison on a map, widened to the 30 most / 30 least expensive
fig = plotting.choropleth_map(df=pd.concat([prices.head(30), prices.tail(30)]), color_col='Avg_price',
                              center_lat=34.20, center_lon=-118.25, zoom=8.5, height=800, colorbar_range=(0, 300))
fig.show()
# Persist the renamed/cleaned listings table for later steps, then verify by
# reading each file straight back.
listings.to_pickle('data/listings2.pkl')
pd.read_pickle('data/listings2.pkl').info()
# index=False: drop the positional index so reading the CSV back does not
# create a spurious 'Unnamed: 0' column. (If the index were meaningful,
# index_label= would be the fix instead — it is a plain row index here.)
listings.to_csv('data/listings2.csv', index=False)
pd.read_csv('data/listings2.csv').info()